In [1]:
%matplotlib inline
import matplotlib
import numpy as np
from matplotlib import pyplot as plt
from sklearn.cluster import KMeans
from scipy.cluster.hierarchy import dendrogram, linkage
from scipy.cluster.hierarchy import cophenet
from scipy.spatial.distance import pdist

Disjoint clusters

generate random data


In [2]:
# Build a small toy dataset of four Gaussian blobs, one per quadrant.
# The global NumPy RNG is seeded first so every run draws the same points.
np.random.seed(5)
N = 4  # number of points drawn per blob
k = 4

# Shared isotropic covariance (variance 0.1 along each axis).
cov_mat = np.identity(2) / 10

# Blob centres, listed in the order the samples are drawn and stacked.
centers = [[-1, -1], [-1, +1], [+1, -1], [+1, +1]]
X1 = np.vstack([np.random.multivariate_normal(center, cov_mat, N)
                for center in centers])

# Blob index (0..3) of each row of X1, N consecutive rows per blob.
cols1 = [idx for idx in range(len(centers)) for _ in range(N)]

Plot the data in grey, labelling each point with its row index


In [3]:
# Scatter the rows of X1 as small grey dots on a 5x5-inch figure and write
# each point's row index next to it, so points can be matched to the integer
# leaf labels that scipy's dendrogram shows by default.
fig = plt.figure(figsize=(5,5))
ax = fig.add_subplot(1,1,1)
ax.scatter(X1[:,0], X1[:,1], s=10, c='grey', alpha=1, linewidth=0)
# `range` rather than `xrange`: `xrange` exists only in Python 2 and raises
# NameError on Python 3, while `range` works on both.
for i in range(X1.shape[0]):
    ax.text(X1[i, 0], X1[i, 1], '%s' % i)
plt.show()



In [4]:
# Agglomerative clustering of X1 with single linkage on Euclidean distances;
# Z1_single is the resulting linkage matrix.
Z1_single = linkage(X1, 'single', 'euclidean')

# Draw the merge tree on a single 5x5-inch axes, with leaf labels rotated
# 90 degrees and set in 8-pt font.
fig, ax = plt.subplots(figsize=(5, 5))
dendrogram(Z1_single, ax=ax, leaf_rotation=90, leaf_font_size=8)
ax.set_title('Single link, Euclidean distances')
plt.show()



In [5]:
# Agglomerative clustering of X1 with average linkage on Euclidean distances;
# Z1_average is the resulting linkage matrix.
Z1_average = linkage(X1, 'average', 'euclidean')

# Draw the merge tree on a single 5x5-inch axes, with leaf labels rotated
# 90 degrees and set in 8-pt font.
fig, ax = plt.subplots(figsize=(5, 5))
dendrogram(Z1_average, ax=ax, leaf_rotation=90, leaf_font_size=8)
ax.set_title('Average link, Euclidean distances')
plt.show()


Example with two concentric rings


In [6]:
# Two concentric noisy rings, N points each, expressed in polar coordinates
# and then converted to Cartesian. The global RNG is re-seeded so the draw
# is repeatable.
np.random.seed(1)
N = 200  # points per ring

# One uniformly random angle in [0, 2*pi) per point (both rings together).
theta = 2 * np.pi * np.random.uniform(size=(2 * N, 1))

# Radii: the first N rows fall in [0.8, 1.2), the next N rows in [1.8, 2.2).
r = np.vstack([np.random.uniform(low=lo, high=hi, size=(N, 1))
               for lo, hi in ((0.8, 1.2), (1.8, 2.2))])

# Polar -> Cartesian; X2 holds x in column 0 and y in column 1.
x = r * np.cos(theta)
y = r * np.sin(theta)
X2 = np.concatenate((x, y), axis=1)

In [7]:
# Show the two-ring dataset X2 as small grey dots on a 5x5-inch figure.
# Per-point index labels are not drawn for this dataset.
fig, ax = plt.subplots(figsize=(5, 5))
ax.scatter(X2[:, 0], X2[:, 1], s=10, c='grey', alpha=1, linewidth=0)
plt.show()


Single-link hierarchical clustering of the two rings (dendrogram)


In [8]:
# Agglomerative clustering of the ring data X2 with single linkage on
# Euclidean distances; Z2_single is the resulting linkage matrix.
Z2_single = linkage(X2, 'single', 'euclidean')

# Draw the merge tree on a single 5x5-inch axes, with leaf labels rotated
# 90 degrees and set in 8-pt font.
fig, ax = plt.subplots(figsize=(5, 5))
dendrogram(Z2_single, ax=ax, leaf_rotation=90, leaf_font_size=8)
ax.set_title('Single link, Euclidean distances')
plt.show()


Average-link hierarchical clustering of the two rings (dendrogram)


In [9]:
# Agglomerative clustering of the ring data X2 with average linkage on
# Euclidean distances; Z2_average is the resulting linkage matrix.
Z2_average = linkage(X2, 'average', 'euclidean')

# Draw the merge tree on a single 5x5-inch axes, with leaf labels rotated
# 90 degrees and set in 8-pt font.
fig, ax = plt.subplots(figsize=(5, 5))
dendrogram(Z2_average, ax=ax, leaf_rotation=90, leaf_font_size=8)
ax.set_title('Average link, Euclidean distances')
plt.show()